Explore and Interpretation of data variables to determine further
data concentration.
Coordinate (Latitude and Longitude)
range(hotspots$lat, na.rm = TRUE) #range is valid
## [1] 48.32883 60.00090
range(hotspots$lon, na.rm = TRUE) #range is valid
## [1] -138.328 -114.090
# British Columbia coordinate :
# Latitude: Between 48.309789°N and 60.00065°N
# Longitude: Between -139.058200 °W and -114.05423°W
Rep_date (Date range)
# Extract year and number of points from event_details
events_count <- event_details %>%
select(year, events_count)
# Check the extracted table
print(events_count)
## # A tibble: 10 × 2
## year events_count
## <dbl> <int>
## 1 2014 1786
## 2 2015 2035
## 3 2016 636
## 4 2017 4293
## 5 2018 5433
## 6 2019 1792
## 7 2020 699
## 8 2021 4437
## 9 2022 2035
## 10 2023 9163
The period of data covers 10 years period started from 2014 and
included the major fire events in 2023.
Show the summary of hotspots dataset in days count per year
# Create summary table describing each hotspots dataset by year
hotspots_df_summary <- hotspots %>%
group_by(year) %>%
summarise(
start_date = min(rep_date),
end_date = max(rep_date),
hotspots_df_day_count = as.numeric(difftime(max(rep_date), min(rep_date), units = "days"))
)
hotspots_df_summary
## # A tibble: 10 × 4
## year start_date end_date hotspots_df_day_count
## <dbl> <dttm> <dttm> <dbl>
## 1 2014 2014-04-04 18:50:00 2014-12-07 20:37:00 247.
## 2 2015 2015-03-07 06:05:00 2015-11-22 21:23:00 261.
## 3 2016 2016-03-25 04:10:00 2016-12-21 04:55:00 271.
## 4 2017 2017-03-03 04:15:00 2017-11-15 11:41:00 257.
## 5 2018 2018-01-06 03:11:00 2018-11-07 18:19:00 306.
## 6 2019 2019-03-13 02:06:00 2019-11-19 18:54:00 252.
## 7 2020 2020-03-03 03:45:00 2020-12-29 05:05:00 301.
## 8 2021 2021-01-06 13:35:00 2021-12-11 18:36:00 339.
## 9 2022 2022-01-11 13:22:00 2022-12-31 19:06:00 354.
## 10 2023 2023-01-01 05:01:00 2023-12-31 19:14:00 365.
# Create summary table describing each hotspots dataset by year
hotspots_df_summary <- hotspots_df_summary %>%
left_join(events_count, by = "year") %>%
mutate(
start_month_day = format(as.Date(start_date), "%m-%d"),
end_month_day = format(as.Date(end_date), "%m-%d")
)
print(hotspots_df_summary)
## # A tibble: 10 × 7
## year start_date end_date hotspots_df_day_count
## <dbl> <dttm> <dttm> <dbl>
## 1 2014 2014-04-04 18:50:00 2014-12-07 20:37:00 247.
## 2 2015 2015-03-07 06:05:00 2015-11-22 21:23:00 261.
## 3 2016 2016-03-25 04:10:00 2016-12-21 04:55:00 271.
## 4 2017 2017-03-03 04:15:00 2017-11-15 11:41:00 257.
## 5 2018 2018-01-06 03:11:00 2018-11-07 18:19:00 306.
## 6 2019 2019-03-13 02:06:00 2019-11-19 18:54:00 252.
## 7 2020 2020-03-03 03:45:00 2020-12-29 05:05:00 301.
## 8 2021 2021-01-06 13:35:00 2021-12-11 18:36:00 339.
## 9 2022 2022-01-11 13:22:00 2022-12-31 19:06:00 354.
## 10 2023 2023-01-01 05:01:00 2023-12-31 19:14:00 365.
## # ℹ 3 more variables: events_count <int>, start_month_day <chr>,
## # end_month_day <chr>
This table shows the beginning and end day of data
collected by yearly basis with number of fire event counts. The notable
finding is started from 2021 to 2023, it shows the data recorded in full
year round, making it has more data point reference and can increase
accuracy of analysis.
UID (Identifier)
# Display the first 10 rows as preview
hotspots %>%
select(uid) %>%
head(10) # Display the first 10 rows
## # A tibble: 10 × 1
## uid
## <dbl>
## 1 5097641
## 2 5349891
## 3 5470838
## 4 5048694
## 5 4073541
## 6 5198198
## 7 5303682
## 8 5537351
## 9 5296999
## 10 4073857
# The range of UID
range(hotspots$uid, na.rm = TRUE)
## [1] 31324 44237954
# The uid variable is present in historical datasets up to 2019 and absent for the rest of the year hence 40% value is missing.
Source (Source of hotspots)
hotspots %>%
select(source) %>%
count(source)
## # A tibble: 15 × 2
## source n
## <chr> <int>
## 1 HMS 4158
## 2 NASA 232234
## 3 NASA1 5013
## 4 NASA2 339423
## 5 NASA3 330797
## 6 NASA6 5091
## 7 NASA7 5410
## 8 NASA_can 12899
## 9 NASA_usa 435
## 10 NASAkmz 89
## 11 NASAwcan 49947
## 12 NASAwusa 143
## 13 NOAA 172479
## 14 UMD 28514
## 15 USFS 589933
sourse_cluster_count <- hotspots %>%
group_by(source) %>%
summarise(num_clusters = n_distinct(event_cluster))
# Create a bar plot of the number of clusters per satellite
ggplot(sourse_cluster_count, aes(x = source, y = num_clusters)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Number of Events Reported by Each Source",
x = "Source",
y = "Number of Events (clusters)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Sensor (Satellite sensor)
hotspots %>%
select(sensor) %>%
count(sensor)
## # A tibble: 8 × 2
## sensor n
## <chr> <int>
## 1 AVHRR 164520
## 2 IBAND 400710
## 3 Landsat 1014
## 4 MODIS 240827
## 5 OLI 299
## 6 VIIRS 94345
## 7 VIIRS-I 869363
## 8 VIIRS-M 5487
sensor_cluster_count <- hotspots %>%
group_by(sensor) %>%
summarise(num_clusters = n_distinct(event_cluster))
# Create a bar plot of the number of clusters per satellite
ggplot(sensor_cluster_count, aes(x = sensor, y = num_clusters)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Number of Events Reported by Each Sensor",
x = "Sensor",
y = "Number of Events (clusters)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Advanced Very High Resolution Radiometer (AVHRR) imagery, courtesy of the NOAA
#Moderate Resolution Imaging Spectroradiometer (MODIS) imagery, courtesy of the NASA
#Visible Infrared Imaging Radiometer Suite (VIIRS) imagery, courtesy of NASA LANCE FIRMS
Agency (indicate province)
hotspots %>%
select(agency) %>%
count(agency)
## # A tibble: 1 × 2
## agency n
## <chr> <int>
## 1 BC 1776565
# only output is BC as the main area of concentration
Variables with numerical columns
# Identify numerical columns to work with
numerical_columns <- c('temp',
'rh',
'ws',
'wd',
'pcp',
'ffmc',
'dmc',
'dc',
'isi',
'bui',
'fwi',
'ros',
'sfc',
'tfc',
'bfc',
'hfi',
'cfb',
'age',
'estarea',
'pcuring',
'cfactor',
'greenup',
'elev',
'cfl',
'tfc0',
'sfl',
'ecozone',
'sfc0',
'cbh')
Statistical summary of numerical variables in hotspots dataset.
# Function to describe each numerical column
describe_numerical <- function(df, cols) {
summary_list <- list()
for (col in cols) {
summary_stats <- data.frame(
Variable = col,
Missing_Values = sum(is.na(df[[col]])),
Min = round(min(df[[col]], na.rm = TRUE), 2),
Median = round(median(df[[col]], na.rm = TRUE), 2),
Mean = round(mean(df[[col]], na.rm = TRUE), 2),
Max = round(max(df[[col]], na.rm = TRUE), 2)
)
summary_list[[col]] <- summary_stats
}
summary_table <- bind_rows(summary_list)
return(summary_table)
}
# Describe numerical columns for hotspots
summary_hotspots <- describe_numerical(hotspots, numerical_columns)
# Print the summary table
print(summary_hotspots)
## Variable Missing_Values Min Median Mean Max
## 1 temp 0 -21.35 22.07 21.03 43.88
## 2 rh 0 0.00 33.00 36.18 100.00
## 3 ws 0 0.00 8.94 9.53 59.30
## 4 wd 0 0.00 212.00 201.81 360.00
## 5 pcp 0 0.00 0.00 0.27 651.79
## 6 ffmc 0 0.00 91.42 88.78 99.00
## 7 dmc 0 0.00 83.80 90.50 459.78
## 8 dc 0 0.00 528.37 513.39 1122.11
## 9 isi 75 0.00 8.52 8.33 137.70
## 10 bui 75 0.00 117.90 120.70 458.66
## 11 fwi 75 0.00 29.73 28.47 183.90
## 12 ros 631 -429.82 4.27 6.53 96.23
## 13 sfc 631 0.00 3.09 2.69 4.99
## 14 tfc 631 0.00 3.19 3.13 9.95
## 15 bfc 749567 0.00 4.13 298.72 45488500.00
## 16 hfi 631 -91845.00 3665.00 8561.24 93142.00
## 17 cfb 55679 0.00 0.00 33.65 100.00
## 18 age 1378187 0.00 0.00 552.53 6742.00
## 19 estarea 1249981 0.00 8.46 6.48 20.92
## 20 pcuring 695143 -1.00 36.00 38.02 125.00
## 21 cfactor 1034202 -1.00 0.04 0.13 1.00
## 22 greenup 695158 -1.00 1.00 0.77 1527.00
## 23 elev 40851 -1.00 981.00 1006.76 3129.00
## 24 cfl 374667 -1.00 0.97 1.14 8.20
## 25 tfc0 377010 0.00 3.19 2.95 6.04
## 26 sfl 746145 -1.00 7.06 9.02 37.35
## 27 ecozone 742841 4.00 14.00 10.51 14.00
## 28 sfc0 742862 0.00 2.93 2.53 4.95
## 29 cbh 1442811 -1.00 8.44 6.99 17.79
Statistical summary of numerical variables in peak period from the
hotspots dataset
# Describe numerical columns for hotspots_peak
summary_hotspots_peak <- describe_numerical(hotspots_peak, numerical_columns)
# Print the summary table
print(summary_hotspots_peak)
## Variable Missing_Values Min Median Mean Max
## 1 temp 0 -9.66 22.24 21.48 43.88
## 2 rh 0 7.00 33.00 35.50 100.00
## 3 ws 0 0.00 8.95 9.53 59.30
## 4 wd 0 0.00 213.00 202.23 360.00
## 5 pcp 0 0.00 0.00 0.23 651.79
## 6 ffmc 0 0.30 91.50 89.51 99.00
## 7 dmc 0 0.00 85.11 92.50 459.78
## 8 dc 0 0.00 531.53 518.46 1122.11
## 9 isi 0 0.00 8.62 8.49 137.70
## 10 bui 0 0.00 119.03 123.29 458.66
## 11 fwi 0 0.00 30.06 29.10 183.90
## 12 ros 0 0.00 4.44 6.68 96.23
## 13 sfc 0 0.00 3.12 2.75 4.99
## 14 tfc 0 0.00 3.23 3.20 9.95
## 15 bfc 734414 0.00 4.15 138.96 45488500.00
## 16 hfi 0 0.00 3867.00 8761.14 93142.00
## 17 cfb 44273 0.00 0.00 34.23 100.00
## 18 age 1350060 0.00 0.00 557.06 5971.00
## 19 estarea 1223385 0.00 8.46 6.62 20.92
## 20 pcuring 687686 0.00 36.00 37.37 125.00
## 21 cfactor 1007287 0.00 0.04 0.13 1.00
## 22 greenup 687686 -1.00 1.00 0.61 1.00
## 23 elev 36148 -1.00 979.00 1005.88 3129.00
## 24 cfl 361971 -1.00 0.96 1.14 8.20
## 25 tfc0 363838 0.00 3.22 3.01 6.04
## 26 sfl 727326 -1.00 7.11 9.11 37.35
## 27 ecozone 727354 4.00 14.00 10.43 14.00
## 28 sfc0 727326 0.00 2.96 2.59 4.95
## 29 cbh 1415153 -1.00 8.38 6.92 17.55
The statistical summary of the hotspots_peak dataset
shows that there are no missing values in key variables, used by
Canadian Forest Fire Weather Index (FWI) System, therefore we can
continue with visual analysis and provide detailed
description.
Summarize table with mean values for plots.
# Create a table with mean values for plots
# Monthly averages for numerical variables
monthly_avg <- hotspots_peak %>%
group_by(year, month) %>%
summarise(avg_temp = mean(temp, na.rm = TRUE),
avg_rh = mean(rh, na.rm = TRUE),
avg_ws = mean(ws, na.rm = TRUE),
avg_pcp = mean(pcp, na.rm = TRUE),
avg_ffmc = mean(ffmc, na.rm = TRUE),
avg_dmc = mean(dmc, na.rm = TRUE),
avg_dc = mean(dc, na.rm = TRUE),
avg_isi = mean(isi, na.rm = TRUE),
avg_sfc = mean(sfc, na.rm = TRUE),
avg_tfc = mean(tfc, na.rm = TRUE),
avg_bfc = mean(bfc, na.rm = TRUE),
avg_hfi = mean(hfi, na.rm = TRUE),
avg_bui = mean(bui, na.rm = TRUE),
avg_fwi = mean(fwi, na.rm = TRUE),
avg_ros = mean(ros, na.rm = TRUE),
.groups = 'drop')
print(monthly_avg)
## # A tibble: 60 × 17
## year month avg_temp avg_rh avg_ws avg_pcp avg_ffmc avg_dmc avg_dc avg_isi
## <dbl> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014 May 16.0 29.8 9.97 1.00 87.4 28.8 79.1 8.24
## 2 2014 Jun 20.4 27.8 9.62 0.618 89.8 58.6 177. 9.55
## 3 2014 Jul 26.9 24.4 9.80 0.0342 94.3 83.5 384. 13.8
## 4 2014 Aug 25.1 31.6 7.86 0.157 91.9 99.7 534. 9.29
## 5 2014 Sep 21.7 32.7 8.11 0.147 90.4 71.9 672. 8.75
## 6 2014 Oct 8.93 73.4 8.02 2.38 49.0 7.72 403. 0.707
## 7 2015 May 18.8 30.8 10.5 0.0850 91.6 34.7 102. 9.87
## 8 2015 Jun 23.3 34.5 10.8 0.284 90.0 62.5 329. 8.33
## 9 2015 Jul 24.2 34.8 11.0 0.401 91.2 83.5 408. 9.60
## 10 2015 Aug 22.5 35.1 9.94 0.124 91.5 102. 532. 9.88
## # ℹ 50 more rows
## # ℹ 7 more variables: avg_sfc <dbl>, avg_tfc <dbl>, avg_bfc <dbl>,
## # avg_hfi <dbl>, avg_bui <dbl>, avg_fwi <dbl>, avg_ros <dbl>
Temp (Temperature)
# This variable shows temperature in Celsius at the specific location, at the fire event
par(mfrow=c(1,2))
# Histogram to show distribution of temperatures
ggplot(hotspots, aes(x = temp)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Temperature at Fire Hotspots",
x = "Temperature (°C)",
y = "Frequency") +
scale_y_continuous(labels = comma) + # Adjust y-axis labels to numeric format (scales package)
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))

# Histogram to show distribution of temp for hotspots_peak
ggplot(hotspots_peak, aes(x = temp)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Temperature at Fire Hotspots (Peak)",
x = "Temperature (°C)",
y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))

Average temperature by year
# Creat a summary table with mean temperature by year
mean_temp_by_year <- hotspots_peak %>%
group_by(year) %>%
summarise(mean_temp = mean(temp, na.rm = TRUE))
# Create a line plot to show temperature trends over years
ggplot(mean_temp_by_year, aes(x = year, y = mean_temp)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "black", size = 2) +
labs(title = "Average Temperature Over Years (Peak)",
x = "Year",
y = "Average Temperature (°C)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Create a line plot for temperature in 2019 for closer look of drop temperature.
ggplot(hotspots_peak %>% filter(year == 2019), aes(x = rep_date, y = temp)) +
geom_line(color = "steelblue") +
labs(title = "Temperature Over the Year 2019",
x = "Date",
y = "Temperature (°C)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The line plot shows that temperature gradually drop from
~20 degree to 0 degree started from July to October indicated the end of
dry season.
Closer look for the temperature comparison from 4 major
different years with most case of fires. 2014 : started of hotspot
dataset 2018 : the historical year of wildfire in BC 2020 : the global
pandemic year where not much fires recorded 2023 : the largest wildfire
events.
# Define common axis limits
y_limits <- range(hotspots_peak$temp[hotspots_peak$year %in% c(2014, 2018, 2020, 2023)])
# Create plots with common axis limits
plot_2014 <- ggplot(hotspots_peak %>% filter(year == 2014), aes(x = rep_date, y = temp)) +
geom_line(color = "steelblue") +
labs(title = "Temperature Over the Peak Season 2014",
x = "Date",
y = "Temperature (°C)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2018 <- ggplot(hotspots_peak %>% filter(year == 2018), aes(x = rep_date, y = temp)) +
geom_line(color = "steelblue") +
labs(title = "Temperature Over the Peak Season 2018",
x = "Date",
y = "Temperature (°C)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2020 <- ggplot(hotspots_peak %>% filter(year == 2020), aes(x = rep_date, y = temp)) +
geom_line(color = "steelblue") +
labs(title = "Temperature Over the Peak Season 2020",
x = "Date",
y = "Temperature (°C)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2023 <- ggplot(hotspots_peak %>% filter(year == 2023), aes(x = rep_date, y = temp)) +
geom_line(color = "steelblue") +
labs(title = "Temperature Over the Peak Season 2023",
x = "Date",
y = "Temperature (°C)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
# Create Line Plots for 4 different years to compare temp
grid.arrange(plot_2014, plot_2018, plot_2020, plot_2023, ncol = 2)

# Trend on temperature drop in October from different years, showing consistent trend and considered common because its beginning of Fall season in BC
RH (Relative Humidity)
# The amount of moisture in the air as a percentage
# 0 to 100 with a mean 36
# Create Line Plots for 4 different years to compare RH
# Define common axis limits
y_limits <- range(hotspots_peak$rh[hotspots_peak$year %in% c(2014, 2018, 2020, 2023)])
# Create plots with common axis limits
plot_2014 <- ggplot(hotspots_peak %>% filter(year == 2014), aes(x = rep_date, y = rh)) +
geom_line(color = "steelblue") +
labs(title = "Humidity Over the Peak Season 2014",
x = "Date",
y = "Relative Humidity (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2018 <- ggplot(hotspots_peak %>% filter(year == 2018), aes(x = rep_date, y = rh)) +
geom_line(color = "steelblue") +
labs(title = "Humidity Over the Peak Season 2018",
x = "Date",
y = "Relative Humidity (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2020 <- ggplot(hotspots_peak %>% filter(year == 2020), aes(x = rep_date, y = rh)) +
geom_line(color = "steelblue") +
labs(title = "Humidity Over the Peak Season 2020",
x = "Date",
y = "Relative Humidity (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
plot_2023 <- ggplot(hotspots_peak %>% filter(year == 2023), aes(x = rep_date, y = rh)) +
geom_line(color = "steelblue") +
labs(title = "Humidity Over the Peak Season 2023",
x = "Date",
y = "Relative Humidity (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = y_limits)
# Arrange the plots side by side
grid.arrange(plot_2014, plot_2018, plot_2020, plot_2023, ncol = 2)

# The variability in relative humidity makes it difficult to establish a consistent trend.
Comparison between temperature and humidity (RH) in monthly
perspective
# Plot monthly temperature averages
temp_plot <- ggplot(monthly_avg, aes(x = month, y = avg_temp, color = factor(year), group = year)) +
geom_line() +
labs(title = "Average Temperature by Month",
x = "Month",
y = "Average Temperature (°C)",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot monthly humidity averages
humidity_plot <- ggplot(monthly_avg, aes(x = month, y = avg_rh, color = factor(year), group = year)) +
geom_line() +
labs(title = "Average Humidity by Month",
x = "Month",
y = "Average Humidity (%)",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(temp_plot, humidity_plot, ncol = 1)

Comparison between temperature and humidity (RH) from 4 different
years.
# Filter for specific years
monthly_avg_filtered <- monthly_avg %>%
filter(year %in% c(2014, 2018, 2020, 2023))
# Plot monthly temperature averages
temp_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_temp, color = factor(year), group = year)) +
geom_line(size = 1) +
labs(title = "Average Temperature by Month",
x = "Month",
y = "Average Temperature (°C)",
color = "Year") +
scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot monthly humidity averages
humidity_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_rh, color = factor(year), group = year)) +
geom_line(size = 1) +
labs(title = "Average Humidity by Month",
x = "Month",
y = "Average Humidity (%)",
color = "Year") +
scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Arrange the plots side by side
grid.arrange(temp_plot_filtered, humidity_plot_filtered, ncol = 1)

There is a combination of high temperatures and low
humidity during the peak fire months (June to September). 2018 and 2023
had higher temperatures and lower humidity, meaning more fires. When
humidity rises sharply in October, fire activity goes down,indicating
the end of the peak fire season.
Wind speed (in km/h)
# 0 to 59 with mean of 9
# Higher wind speeds in peak fire months can make fire more intense and make it spread faster.
# Common wind speed scale used is the Beaufort scale (in m/s)
# Convert wind speed from km/h to m/s (t) - to plot wd later
hotspots_peak$ws <- hotspots_peak$ws * 0.27778
# Plot monthly wind speed averages with custom colors
ws_plot_filtered <- ggplot(monthly_avg_filtered, aes(x = month, y = avg_ws, color = factor(year), group = year)) +
geom_line(size = 1) +
labs(title = "Average Wind Speed by Month",
x = "Month",
y = "Average Wind Speed (m/s)",
color = "Year") +
scale_color_manual(values = c("2014" = "#F8766D", "2018" = "#00C19F", "2020" = "#619CFF", "2023" = "#FF61C3")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Arrange the plots side by side
grid.arrange(temp_plot_filtered, humidity_plot_filtered, ws_plot_filtered, ncol = 1)

Wind speed shows significant variability
“wd” (Wind direction)
# Wind exploration with openair() package.
# Visualise with Wind Rose to see most common wind direction
library(openair)
windRose(mydata = hotspots_peak, ws = "ws", wd = "wd",
main = "Wind Rose", paddle = FALSE)

# Most of the wind comes from the west (W) and southwest (SW).
# Regions east and north of areas with strong western, southern and southwestern winds
# should be careful, as these winds can quickly spread fires.
# Wind patterns have are important for wildfire management.
Most of the wind comes from the west (W) and southwest
(SW).Regions east and north of areas with strong western, southern and
southwestern wind should be careful, as these winds can quickly spread
fires.
Wind patterns have are important for wildfire
management. Higher wind speeds in peak fire months can make fire more
intense and make it spread faster.Wind speed shows significant
variability.
Fire Weather Index System indices
FFMC (Fine Fuel Moisture Code)
# A numeric rating of the moisture content of litter and other cured fine fuels.
# This code is an indicator of the relative ease of ignition and the flammability of fine fuel.
# The FFMC scale ranges from 0 to 101.
# 0-30: Very wet conditions; ignition is difficult.
# 30-70: Damp conditions; moderate difficulty for ignition.
# 70-85: Dry conditions; fuels are easily ignitable.
# 85-101: Very dry conditions; fuels are highly ignitable and fires can spread rapidly.
# Plot histogram for FFMC
ggplot(hotspots_peak, aes(x = ffmc)) +
geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of FFMC Values", x = "FFMC", y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal()

The histogram reveals that most FFMC values from 2014 to 2023
are between 85 and 99,confirming consistently dry conditions.
# Plot boxplot for FFMC
ggplot(hotspots_peak, aes(x = factor(year), y = ffmc)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "FFMC Distribution Across Years",
x = "Year",
y = "FFMC") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplots show that most years have high FFMC values above
90,meaning conditions are very dry and prone to fires. Year of 2020 and
2019 are 2 years of less drought conditions for fine fuels
# Line plot for FFMC over time
ggplot(monthly_avg, aes(x = month, y = avg_ffmc, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Dashed line for trend
labs(title = "FFMC by Month",
x = "Month",
y = "FFMC",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot shows that FFMC values peak every year during the
fire season,indicating drier conditions that make it easier for fires to
start.
DMC (Duff Moisture Code)
# A numeric rating of the average moisture content
# of loosely compacted organic layers of moderate depth.
# This code gives an indication of fuel consumption in moderate duff layers
# and medium-size woody material.
# Values below 20: Low fire danger, duff layers are wet, and ignition is unlikely.
# Values between 21 and 40: Moderate fire danger, duff layers start drying.
# Values between 41 and 100: High fire danger, the duff layer is dry, prone to ignition.
# Values above 100: Extreme fire danger, the duff layer is very dry, and ignition is very likely with the potential for intense fires.
# Plot histogram for DMC
ggplot(hotspots_peak, aes(x = dmc)) +
geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of DMC Values", x = "DMC", y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal()

The histogram shows that most DMC values range between 0 and
100,with a peak around 50-100. Values above 200 are less common but
indicate long dry periods.
# Plot boxplot for DMC
ggplot(hotspots_peak, aes(x = factor(year), y = dmc)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "DMC Distribution Across Years",
x = "Year",
y = "DMC") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplot for each year reveals that 2017 and 2018 had
particularly high outliers,indicating extremely dry conditions that
could lead to fires.
On the other hand, 2016 and 2019 had lower DMC values, showing
less severe drought.
# Line plot for DMC over time
ggplot(monthly_avg, aes(x = month, y = avg_dmc, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Dashed line for trend
labs(title = "DMC by Month",
x = "Month",
y = "DMC",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot indicates that DMC values peak in late summer
(July-August) and decrease in autumn.
This shows that mid to late summer is typically drier,the risk
of fire ignition and spread is high during these months.
DC (Drought Code)
# A numeric rating of the average moisture content of deep, compact organic layers.
# This code is a useful indicator of seasonal drought effects on forest fuels
# and the amount of smoldering in deep duff layers and large logs.
# 0-100: Indicates wet conditions with low fire potential.
# 100-300: Moderate drought conditions with increasing fire potential.
# 300-500: High drought conditions, leading to high fire risk.
# 500+: Indicates extreme drought, posing a very high fire risk and potential for intense, prolonged burning.
# Plot histogram for DC
ggplot(hotspots_peak, aes(x = dc)) +
geom_histogram(binwidth = 20, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of DC Values", x = "DC", y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal()

The histogram of DC values from 2014 to 2023 shows most values
are between 300 and 600,peaking around 500. This means BC often has
conditions that can lead to deep-burning fires.
# Plot boxplot for DC
ggplot(hotspots_peak, aes(x = factor(year), y = dc)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "DC Distribution Across Years",
x = "Year",
y = "DC") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplot of DC values across years show the variability and
trends in drought conditions over time.
2017 and 2018: These years have higher median and upper whisker
values, meaning very dry conditions, likely leading to severe fire
seasons.
2016 and 2019: These years show lower DC values, suggesting
wetter conditions and potentially less severe fire activity.
# Line plot for DC over time
ggplot(monthly_avg, aes(x = month, y = avg_dc, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Dashed line for trend
labs(title = "DC by Month",
x = "Month",
y = "DC",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot of DC values by month shows a clear seasonal
trend, DC values increase during the summer months (July to August) and
decrease in the autumn. This occurs at the time of the peak fire season.
There are a lot of high DC values, more than 500, it shows conditions
for deep-burning fires.
Compare indices trends to timeline of fire events
# Normalize the number of fire events to allow for a meaningful comparison with the indices.
# Merge the dataframes
monthly_data <- merge(monthly_avg, fire_events_per_month, by = c("year", "month"))
# Convert 'month' column from ordered factor to numeric
monthly_data$month <- as.numeric(monthly_data$month)
# Create the 'Date' column using 'year' and 'month'
monthly_data$Date <- as.Date(paste(monthly_data$year, monthly_data$month, "01", sep = "-"))
# Normalize the fire numbers data
max_events <- max(monthly_data$n_events, na.rm = TRUE)
monthly_data <- monthly_data %>%
mutate(norm_n_events = n_events / max_events)
# Maximum values for scaling
max_ffmc <- max(monthly_data$avg_ffmc, na.rm = TRUE)
max_dmc <- max(monthly_data$avg_dmc, na.rm = TRUE)
max_dc <- max(monthly_data$avg_dc, na.rm = TRUE)
Visualization between Fire Indices with fire counts (FFMC, DMC,
DC)
# Plot FFMC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_ffmc, color = "FFMC"), size = 1) +
geom_bar(aes(y = norm_n_events * max_ffmc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "FFMC",
sec.axis = sec_axis(~ . * 1000 / max_ffmc * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of FFMC and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("FFMC" = "lightblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot DMC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_dmc, color = "DMC"), size = 1) +
geom_bar(aes(y = norm_n_events * max_dmc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "DMC",
sec.axis = sec_axis(~ . * 1000 / max_dmc * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of DMC and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("DMC" = "lightblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot DC and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_dc, color = "DC"), size = 1) +
geom_bar(aes(y = norm_n_events * max_dc, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "DC",
sec.axis = sec_axis(~ . * 1000 / max_dc * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of DC and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("DC" = "lightblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The plots show a clear relationship between the fire indices
(FFMC, DMC, and DC) and the number of fires. When the number of fires is
high, the indices also show higher values.
ISI (Initial Spread Index)
# A numerical rating of the expected rate of fire spread
# based on wind speed, temperature, and fine fuel moisture content.
# ISI is crucial for understanding how quickly a fire can spread once it has ignited.
# 0-3: Low spread potential. Fires will spread slowly and are relatively easy to control.
# 4-7: Moderate spread potential. Fires spread more quickly and may require more effort to control.
# 8-12: High spread potential. Fires spread rapidly and can be difficult to control.
# 13-19: Very high spread potential. Fires spread very rapidly and are challenging to control.
# 20+: Extreme spread potential. Fires spread uncontrollably and can be extremely dangerous.
# Plot histogram for ISI
ggplot(hotspots_peak, aes(x = isi)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of ISI Values", x = "ISI", y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

The histogram indicates that most ISI values are low to
moderate.
# Plot boxplot for ISI
ggplot(hotspots_peak, aes(x = factor(year), y = isi)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "ISI Distribution Across Years",
x = "Year",
y = "ISI") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplot shows variability in ISI values across years, with
notable outliers in 2014 and 2020.
# Line plot for ISI over time
ggplot(monthly_avg, aes(x = month, y = avg_isi, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Smoothed trend line
labs(title = "ISI by Month",
x = "Month",
y = "ISI",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The graphs illustrate that ISI values tend to peak during the
summer months,indicating higher fire spread potential during this
period.
Analyze extreme outlier of ISI
# To improve the clarity of these insights,
# removing NA values and extreme outliers from the dataset is necessary.
# Remove extreme outliers
hotspots_peak_filtered <- hotspots_peak%>%
filter(isi < 60)
dim(hotspots_peak_filtered)
## [1] 1734602 42
# Filter out entries with ISI values greater than 60
event_outliers <- hotspots_peak %>%
filter(isi >= 60)
# Check single event
dim(event_outliers)
## [1] 11 42
view(event_outliers)
Sample of one specific event with FWI index
# The data from July 19, 2014, shows very high ISI values at a specific location in British Columbia.
# This matches the date of the Mount McAllister fire, a large wildfire that burned over 20,000 hectares.
# The weather that day included high temperatures, low humidity, and strong winds, which made the fire spread quickly.
# These conditions explain the high ISI values, indicating a high potential for severe fire behavior.
# The data points from July 19, 2014, are all in the same cluster, showing that the clustering algorithm grouped them correctly.
# Filter the specific event cluster and near events
event_McAllister <- hotspots %>%
filter(event_cluster %in% 370:372)
# Calculate monthly averages for July 2014
monthly_avg_july <- monthly_avg %>%
filter(year == 2014 & month == "Jul")
# Plot the weather conditions with average lines
ggplot(event_McAllister, aes(x = rep_date)) +
geom_point(aes(y = temp, color = "Temperature"), size = 3) +
geom_point(aes(y = rh, color = "Relative Humidity"), size = 3) +
geom_point(aes(y = ws, color = "Wind Speed"), size = 3) +
geom_hline(aes(yintercept = monthly_avg_july$avg_temp, color = "Avg Temperature"), linetype = "dashed") +
geom_hline(aes(yintercept = monthly_avg_july$avg_rh, color = "Avg Relative Humidity"), linetype = "dashed") +
geom_hline(aes(yintercept = monthly_avg_july$avg_ws, color = "Avg Wind Speed"), linetype = "dashed") +
labs(title = "Weather Conditions on July 19, 2014 with Monthly Averages",
x = "Date",
y = "Value",
color = "Variable") +
scale_color_manual(values = c("Temperature" = "red3", "Relative Humidity" = "steelblue2", "Wind Speed" = "green",
"Avg Temperature" = "red3", "Avg Relative Humidity" = "steelblue2", "Avg Wind Speed" = "green")) +
theme_minimal()

# Create a summary table of the weather conditions
weather_summary <- event_McAllister %>%
select(lat, lon, rep_date, temp, rh, ws, wd, pcp, ffmc, dmc, dc, isi, bui, fwi, ros)
# Print the summary table
print(weather_summary)
## # A tibble: 110 × 15
## lat lon rep_date temp rh ws wd pcp ffmc dmc
## <dbl> <dbl> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 53.2 -126. 2014-08-08 04:21:00 14.3 60 10.2 0 0.2 87.9 119.
## 2 50.3 -122. 2014-07-19 21:24:00 28.4 28 22.6 206 0 93.3 201
## 3 53.3 -125. 2014-07-16 20:45:00 27.3 23 6.6 249 0 95.1 86
## 4 53.3 -125. 2014-07-16 20:49:00 26.6 22 7.4 22 0 95.1 91
## 5 50.3 -122. 2014-07-19 21:24:00 27.5 33 59.3 190 0 93.4 237.
## 6 53.2 -125. 2014-08-08 04:21:00 14.3 60 10.2 0 0.2 87.9 119.
## 7 53.3 -126. 2014-07-16 20:49:00 26.6 22 7.4 22 0 95.1 91
## 8 53.4 -126. 2014-07-16 20:45:00 26.6 22 7.4 22 0 95.1 91
## 9 53.3 -125. 2014-07-16 20:49:00 27.3 23 6.6 249 0 95.1 86
## 10 53.3 -126. 2014-07-16 20:49:00 26.6 22 7.4 22 0 95.1 91
## # ℹ 100 more rows
## # ℹ 5 more variables: dc <dbl>, isi <dbl>, bui <dbl>, fwi <dbl>, ros <dbl>
This graph shows the weather conditions at McAllister Creek on
July 19, 2014,including temperature, relative humidity, and wind speed.
It compares these values to the average conditions for July
2014.
The wind speed during this event is notably higher than the
monthly average,suggesting stronger winds that could help spread the
fire.
Visualization of ISI after removing extreme value
# Plot histogram for ISI
ggplot(hotspots_peak, aes(x = isi)) +
geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of ISI Values", x = "ISI", y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

The histogram now shows a more typical range of ISI values,
removing the extreme values that distorted the original plot.
# Plot boxplot for ISI
ggplot(hotspots_peak_filtered, aes(x = factor(year), y = isi)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "ISI Distribution Across Years",
x = "Year",
y = "ISI") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplot shows a clearer, more consistent ISI distribution
across the years after removing high outliers.
# Line plot for ISI over time
ggplot(monthly_avg, aes(x = month, y = avg_isi, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Smoothed trend line
labs(title = "ISI by Month",
x = "Month",
y = "ISI",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The seasonal pattern plot still peaks in mid-summer but appears
smoother without the extreme values.
Correlation of ISI and Weather Indices
# Using monthly averages instead of individual observations for quicker plotting
ggplot(monthly_avg, aes(x = avg_ws, y = avg_isi)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Average ISI vs Average Wind Speed", x = "Average Wind Speed (km/h)", y = "Average ISI") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Wind Speed: Shows that higher wind
speeds are linked to higher ISI values.
ggplot(monthly_avg, aes(x = avg_temp, y = avg_isi)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Average ISI vs Average Temperature", x = "Average Temperature (°C)", y = "Average ISI") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Temperature: Shows that higher
temperatures are associated with higher ISI values.
ggplot(monthly_avg, aes(x = avg_rh, y = avg_isi)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "green") +
labs(title = "Average ISI vs Average Relative Humidity", x = "Average Relative Humidity (%)", y = "Average ISI") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average ISI vs. Average Relative Humidity: Shows that as
relative humidity increases, ISI decreases.
Correlation matrix between ISI and weather variables (WS, temp,
RH)
# Select columns
data_corr <- hotspots_peak %>%
select(isi, ws, temp, rh)
# Calculate the correlation matrix
corr_matrix <- cor(data_corr)
# Plot the correlation matrix
corrplot(corr_matrix, method = "circle", type = "lower",
tl.col = "black", tl.srt = 45, title = "Correlation Matrix of ISI and Weather Variables",
mar = c(0, 0, 1, 0))

Strong negative correlation between ISI and relative humidity.
Strong positive correlation between ISI and temperature. Moderate
positive correlation between ISI and wind speed.
BUI (Buildup Index)
# A numerical rating of the total amount of fuel available for combustion.
# It is derived from the Duff Moisture Code (DMC) and the Drought Code (DC)
# Low: 0-40
# Moderate: 41-80
# High: 81-120
# Extreme: 121 and above
ggplot(hotspots_peak, aes(x = bui)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of BUI Values", x = "BUI", y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

The histogram shows that BUI values mostly range from 60 to 100.
This means the fire potential is moderate to high for most of the
dataset.
ggplot(hotspots_peak, aes(x = factor(year), y = bui)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "BUI Distribution Across Years",
x = "Year",
y = "BUI") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

Boxplots show BUI values vary year to year. Some years have
higher values, which means drier conditions and a higher chance of
intense fires.
ggplot(monthly_avg, aes(x = month, y = avg_bui, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Smoothed trend line
labs(title = "BUI by Month",
x = "Month",
y = "BUI",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

BUI peaks in mid-summer, which matches the time when fire
activity is usually the highest.
# Scatter plots to visualize relationships using average values
ggplot(monthly_avg, aes(x = avg_dmc, y = avg_bui)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Average BUI vs Average DMC", x = "Average DMC", y = "Average BUI") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

BUI and DC also show a positive correlation,though it is
slightly weaker.
Correlation matrix of BUI, DMC and FC
# Select columns
data_corr <- hotspots_peak %>%
select(bui, dmc, dc)
# Calculate the correlation matrix
corr_matrix <- cor(data_corr)
# Plot the correlation matrix
corrplot(corr_matrix, method = "circle", type = "lower",
tl.col = "black", tl.srt = 45, title = "Correlation Matrix of BUI, DMC, and DC",
mar = c(0, 0, 1, 0))

BUI, DMC, and DC are all indicators of fire potential. High
values in these indices suggest drier conditions, which can lead to
higher fire.
FWI (Fire Weather Index)
# A numeric rating of fire intensity. It is based on the ISI and the BUI,
# and is used as a general index of fire danger throughout the forested areas of Canada.
# Low (0-5): Minimal fire danger.
# Moderate (6-12): Fires can start from most accidental causes, but the spread is slow.
# High (13-20): Fires can start easily and spread rapidly.
# Very High (21-30): Fires will start very easily, spread rapidly, and burn intensely.
# Extreme (31+): Fires start and spread quickly, and are intense and challenging to control.
ggplot(hotspots_peak, aes(x = fwi)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of FWI Values", x = "FWI", y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

The histogram shows that most FWI values are between 10 and 30,
fire danger is from moderate to very high. Many values are above 30,
severe fire weather conditions are frequent.
ggplot(hotspots_peak, aes(x = factor(year), y = fwi)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "FWI Distribution Across Years",
x = "Year",
y = "FWI") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The boxplot shows the distribution of FWI values for different
years.The line plot shows that FWI peaks in mid-summer, peak fire
activity period.The data shows significant variability in fire danger,
summer months and certain years have extreme outliers.
ggplot(monthly_avg, aes(x = month, y = avg_fwi, color = factor(year), group = year)) +
geom_line(size = 0.5, alpha = 0.6, linetype = "dotted") + # raw data
geom_smooth(se = FALSE, method = "loess", size = 1, linetype = "solid") + # Smoothed trend line
labs(title = "FWI by Month",
x = "Month",
y = "FWI",
color = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The line plot shows that FWI peaks in mid-summer, peak fire
activity period. The data shows significant variability in fire danger,
summer months and certain years have extreme outliers.
Analyze the trends of key indices (ISI BUI FWI) and the number of
fire events.
# Maximum values for scaling
max_isi <- max(monthly_data$avg_isi, na.rm = TRUE)
max_bui <- max(monthly_data$avg_bui, na.rm = TRUE)
max_fwi <- max(monthly_data$avg_fwi, na.rm = TRUE)
# Plot ISI and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_isi, color = "ISI"), size = 1) +
geom_bar(aes(y = norm_n_events * max_isi, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "ISI",
sec.axis = sec_axis(~ . * 1000 / max_isi * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of ISI and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("ISI" = "steelblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot BUI and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_bui, color = "BUI"), size = 1) +
geom_bar(aes(y = norm_n_events * max_bui, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "BUI",
sec.axis = sec_axis(~ . * 1000 / max_bui * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of BUI and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("BUI" = "steelblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot FWI and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_fwi, color = "FWI"), size = 1) +
geom_bar(aes(y = norm_n_events * max_fwi, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "FWI",
sec.axis = sec_axis(~ . * 1000 / max_fwi * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of FWI and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("FWI" = "steelblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The plots show a clear link between indices and the number of
fires.When there are more fires, the FWI is also higher.
Fuel (Fuel Type)
# D1: Deciduous trees (leafless, early spring to fall)
# C2, C3, C4, C5, C7: Various types of coniferous trees
# O1, O1a, O1b: Grass or herbaceous vegetation
# M1, M1M2, M2, M2_25, M2_35, M2_50, M2_65: Mixedwood or transitional vegetation
# bog: Wetland areas with peatland vegetation
# water: Water bodies
# urban: Urban areas
# non_fuel: Areas with no significant vegetation (rock, gravel)
# low_veg: Areas with low vegetation (possibly recently burned or cleared)
# farm: Agricultural areas
# D2: Deadfall or downed wood
# Group by fuel and summarize
fuel_counts <- hotspots_peak %>%
group_by(fuel) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the fuel counts
print(fuel_counts)
## # A tibble: 27 × 2
## fuel count
## <chr> <int>
## 1 C2 566652
## 2 C3 383781
## 3 D1 187812
## 4 C7 149801
## 5 C5 71006
## 6 D2 68185
## 7 M2_65 60189
## 8 C1 43814
## 9 M2_50 41250
## 10 O1b 37656
## # ℹ 17 more rows
# Create a bar plot for the fuel column
ggplot(hotspots_peak, aes(x = fuel)) +
geom_bar(fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Fuel Types", x = "Fuel Type", y = "Count") +
theme_minimal() +
scale_y_continuous(labels = scales::comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The dataset shows a variety of fuel types, with “C2” (566,651
records) and “C3” (383,781 records) being the most common. Other
significant fuel types include “D1” (187,803 records) and “C7” (149,801
records).
“ros” (Rate of Spread)
# The predicted speed of the fire at the front or head of the fire (where the fire moves fastest),and takes into account both crowning and spotting.
# The scale ranges from 0 to over 20 m/min in the dataset.
ggplot(hotspots_peak, aes(x = ros)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution Rate of Spread at Fire Hotspots",
x = "ROS (m/min)",
y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))

ggplot(hotspots_peak, aes(x = factor(year), y = ros)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Rate of Spread Distribution Across Years",
x = "Year",
y = "ROS (m/min)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The plot shows that the ROS can vary significantly year to year,
with some years experiencing more extreme fire spread
conditions.
# Scatter plot for average ROS vs average ISI
ggplot(monthly_avg, aes(x = avg_isi, y = avg_ros)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Average ROS vs Average ISI",
x = "Average ISI",
y = "Average ROS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot for average ROS vs average BUI
ggplot(monthly_avg, aes(x = avg_bui, y = avg_ros)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Average ROS vs Average BUI",
x = "Average BUI",
y = "Average ROS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot for average ROS vs average FWI
ggplot(monthly_avg, aes(x = avg_fwi, y = avg_ros)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "green") +
labs(title = "Average ROS vs Average FWI",
x = "Average FWI",
y = "Average ROS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The scatter plots for ROS vs FWI, ROS vs BUI, and ROS vs ISI all
show a clear upward trend.This means that as the values of FWI, BUI, and
ISI increase, the Rate of Spread also increases. Higher values indicate
more severe fire conditions,more available fuel, and faster initial fire
spread. This means the fire spreads faster.
Correlation matrix between ROS, ISI, BUI and FWI
# Calculate correlations between ROS, ISI, BUI, and FWI
correlation_ros_indices <- hotspots_peak %>%
select(ros, isi, bui, fwi) %>%
cor(use = "complete.obs")
# Print the correlation matrix
print(correlation_ros_indices)
## ros isi bui fwi
## ros 1.0000000 0.5868442 0.2678373 0.5416872
## isi 0.5868442 1.0000000 0.4671475 0.9297335
## bui 0.2678373 0.4671475 1.0000000 0.7159426
## fwi 0.5416872 0.9297335 0.7159426 1.0000000
# Plot the correlation matrix
corrplot(correlation_ros_indices, method = "circle", type = "lower",
tl.col = "black", tl.srt = 45, title = "Correlation Matrix of ROS and Other Indices",
mar = c(0, 0, 1, 0))

The matrix shows strong positive correlations between ROS and
ISI, ROS and FWI,and a moderate positive correlation between ROS and
BUI.
“sfc” (Surface Fuel Consumption)
# SFC represents the amount of fuel consumed by surface fires,
# including organic soil (duff), surface litter, dead and downed woody debris.
# Calculated using indices of the FWI System (BUI and DC).
print(monthly_avg)
## # A tibble: 60 × 17
## year month avg_temp avg_rh avg_ws avg_pcp avg_ffmc avg_dmc avg_dc avg_isi
## <dbl> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014 May 16.0 29.8 9.97 1.00 87.4 28.8 79.1 8.24
## 2 2014 Jun 20.4 27.8 9.62 0.618 89.8 58.6 177. 9.55
## 3 2014 Jul 26.9 24.4 9.80 0.0342 94.3 83.5 384. 13.8
## 4 2014 Aug 25.1 31.6 7.86 0.157 91.9 99.7 534. 9.29
## 5 2014 Sep 21.7 32.7 8.11 0.147 90.4 71.9 672. 8.75
## 6 2014 Oct 8.93 73.4 8.02 2.38 49.0 7.72 403. 0.707
## 7 2015 May 18.8 30.8 10.5 0.0850 91.6 34.7 102. 9.87
## 8 2015 Jun 23.3 34.5 10.8 0.284 90.0 62.5 329. 8.33
## 9 2015 Jul 24.2 34.8 11.0 0.401 91.2 83.5 408. 9.60
## 10 2015 Aug 22.5 35.1 9.94 0.124 91.5 102. 532. 9.88
## # ℹ 50 more rows
## # ℹ 7 more variables: avg_sfc <dbl>, avg_tfc <dbl>, avg_bfc <dbl>,
## # avg_hfi <dbl>, avg_bui <dbl>, avg_fwi <dbl>, avg_ros <dbl>
ggplot(hotspots_peak, aes(x = sfc)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
labs(title = "Distribution of Surface Fuel Consumption (SFC)",
x = "SFC (kg/m²)",
y = "Frequency") +
theme_minimal() +
scale_y_continuous(labels = comma)

“tfc” (Total Fuel Consumption)
# TFC is the total amount of fuel consumed by both surface and crown fires.
# It includes all components in SFC plus the consumption of overstory fuels (canopy).
# Estimates the overall carbon emissions from a fire event.
# Range varies widely based on the intensity and spread of the fire, as well as the initial fuel load and moisture content.
ggplot(hotspots_peak, aes(x = tfc)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
labs(title = "Distribution of Total Fuel Consumption (TFC)",
x = "TFC (kg/m²)",
y = "Frequency") +
theme_minimal() +
scale_y_continuous(labels = comma)

“bfc” (Bonfire Fuel Consumption)
# BFC refers to a specific component of the BORFIRE model focused on estimating carbon emissions from boreal forest fires.
summary(hotspots_peak$bfc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 3 4 139 5 45488500 734414
# Explore huge bfc values
huge_bfc <- hotspots_peak %>%
filter(bfc > 1000)
# Print the resulting data frame to inspect the entries with huge bfc values
huge_bfc %>%
group_by(year, month) %>%
summarise(n_entries = n()) %>%
arrange(year, month)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: year [2]
## year month n_entries
## <dbl> <ord> <int>
## 1 2020 Aug 3
## 2 2020 Oct 87
## 3 2021 Sep 8
## 4 2021 Oct 102
summary(huge_bfc$bfc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1004 2103 3335 673558 19825 45488500
# Remove NA values from BFC data
hotspots_peak_BFC <- hotspots_peak %>%
filter(!is.na(bfc))
# Calculate IQR for BFC
IQR_bfc <- IQR(hotspots_peak_BFC$bfc, na.rm = TRUE)
Q1_bfc <- quantile(hotspots_peak_BFC$bfc, 0.25, na.rm = TRUE)
Q3_bfc <- quantile(hotspots_peak_BFC$bfc, 0.75, na.rm = TRUE)
# Define lower and upper bounds for outliers
upper_bound_bfc <- Q3_bfc + 1.5 * IQR_bfc
# Filter out the outliers
hotspots_peak_BFC <- hotspots_peak_BFC %>%
filter(bfc <= upper_bound_bfc)
# Summary of filtered data
summary(hotspots_peak_BFC$bfc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.265 4.057 3.743 4.659 7.274
# Plot Histogram of BFC Data Without Handling Outliers
ggplot(hotspots_peak_BFC, aes(x = bfc)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "black") +
labs(title = "Distribution of Boreal Fire Consumption (BFC)",
x = "BFC",
y = "Frequency") +
theme_minimal() +
scale_y_continuous(labels = scales::comma)

“hfi” (Head Fire Intensity)
# Measures the intensity or energy output of a fire at its front (head).
# HFI is measured in kilowatts per metre (kW/m) of the fire front and is calculated based on the Rate of Spread (ROS)
# and the Total Fuel Consumption (TFC).
# Low (0-500 kW/m): Fires are relatively easy to control and generally cause limited damage.
# Moderate (500-2000 kW/m): Fires can be challenging to control, requiring more resources and effort.
# High (2000-4000 kW/m): Fires are intense and difficult to manage, often requiring aerial firefighting resources.
# Very High (4000-10,000 kW/m): Fires are extremely intense and nearly impossible to control, posing significant risk to life and property.
# Extreme (10,000+ kW/m): Fires exhibit explosive behavior and can cause catastrophic damage.
# Plot histogram for HFI
ggplot(hotspots_peak, aes(x = hfi)) +
geom_histogram(binwidth = 1000, fill = "skyblue", color = "black", alpha = 0.7) + # Each bin represent a range of 1000 HFI units.
labs(title = "Distribution of HFI Values",
x = "HFI (kW/m)",
y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(0, 75000, 10000)) +
theme_minimal()

The histogram shows the frequency distribution of HFI
values.Most HFI values are concentrated at the lower end of the scale,
as values increase there are fewer.
Very high HFI values are outliers they show occasional extremely
intense fires. While most fires are less intense, the few high-intensity
fires can be significant and impactful.
ggplot(hotspots_peak, aes(x = factor(year), y = hfi)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "HFI Distribution Across Years",
x = "Year",
y = "HFI (kW/m)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

Different years show a lot of differences in HFI values. For
example, 2014 and 2021 had higher peaks, meaning there was more intense
fire activity in those years.
2020 had lower HFI values, there were milder fire conditions or
better fire control efforts.The plots also show how HFI can change from
month to month within a single fire season.Weather conditions,
temperature, humidity, and wind speed, can affect how fires
behave.
Analyze the trends of HFI and the number of fire events.
# Maximum values for scaling
max_hfi <- max(monthly_data$avg_hfi, na.rm = TRUE)
# Plot HFI and Fire Counts
ggplot(monthly_data, aes(x = Date)) +
geom_line(aes(y = avg_hfi, color = "HFI"), size = 1) +
geom_bar(aes(y = norm_n_events * max_hfi, fill = "Fire Occurrences"), stat = "identity", color = "black", alpha = 0.6) +
scale_y_continuous(
name = "HFI",
sec.axis = sec_axis(~ . * 1000 / max_hfi * max(monthly_data$n_events) / 1000,
name = "Number of Fire Occurrences", labels = comma)) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
labs(title = "Monthly Trends of HFI and Fire Occurrences",
x = "Year",
color = "Index",
fill = "Index") +
scale_color_manual(values = c("HFI" = "lightblue")) +
scale_fill_manual(values = c("Fire Occurrences" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The HFI index shows notable peaks, particularly in the years
2015, 2018, and 2019, showing periods with high fire intensity.These
peaks suggest severe fire conditions during these years.
The number of fire events is notably higher in the years with
higher HFI values, such as 2018 and 2019.
“cfb” (Crown Fraction Burned)
# CFB is the predicted fraction of the tree crowns consumed by the fire.
# 0 to 100 with a mean of 33
# It ranges from 0 to 100%,
# where 0% means no crown fire activity and 100% indicates
# that the entire tree crowns in the area have been burned.
ggplot(hotspots_peak, aes(x = cfb)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of CFB at Fire Hotspots",
x = "CFB",
y = "Frequency") +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## Warning: Removed 44273 rows containing non-finite outside the scale range
## (`stat_bin()`).

This histogram is dominated by zero values, making it difficult
to identify clear trends.
# Count missing values in the CFB column
sum(is.na(hotspots_peak$cfb))
## [1] 44273
# Identify years with the most missing values
missing_cfb <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(cfb)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(missing_percentage))
missing_cfb
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2015 44579 25012 56.1
## 2 2016 11094 5197 46.8
## 3 2014 36152 14064 38.9
## 4 2017 270146 0 0
## 5 2018 365355 0 0
## 6 2019 28534 0 0
## 7 2020 11507 0 0
## 8 2021 232832 0 0
## 9 2022 46587 0 0
## 10 2023 687827 0 0
2015 has the highest percentage of missing values at 56.1%. 2017
has 54.9%. 2016 has 46.8%. 2018-2024 and 2014 have no missing
values
zero_cfb <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
zero_count = sum(cfb == 0, na.rm = TRUE),
zero_percentage = (zero_count / total_count) * 100
) %>%
arrange(desc(zero_percentage))
zero_cfb
## # A tibble: 10 × 4
## year total_count zero_count zero_percentage
## <dbl> <int> <int> <dbl>
## 1 2019 28534 26443 92.7
## 2 2022 46587 38289 82.2
## 3 2020 11507 8984 78.1
## 4 2023 687827 446669 64.9
## 5 2021 232832 131123 56.3
## 6 2017 270146 131047 48.5
## 7 2018 365355 112988 30.9
## 8 2014 36152 17 0.0470
## 9 2015 44579 17 0.0381
## 10 2016 11094 2 0.0180
2019 has the highest percentage of zero values at 92.7%. 2020,
2021, 2022, and 2023 also have high values ranging from 65.4% to 77.8%.
2015-2018, and 2024 have almost no zero values.
Visualization of CFB after removing NA value
# Filter out zero and na values to check the CFB values when it was properly recorded.
hotspots_peak_CFB <- hotspots_peak %>%
filter(cfb > 0 & !is.na(cfb))
# Create the histogram for CFB
ggplot(hotspots_peak_CFB, aes(x = cfb)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
labs(title = "Distribution of Crown Fraction Burned (CFB) at Fire Hotspots",
x = "CFB",
y = "Frequency") +
theme_minimal() +
scale_y_continuous(labels = scales::comma)

“age” (Age of the Tree)
# The age is indicated the age of tree were burned
# Count missing values in the Age column
sum(is.na(hotspots_peak$age))
## [1] 1350060
# Identify years with the most missing values
missing_age <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(age)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(missing_percentage))
missing_age
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2019 28534 28534 100
## 2 2020 11507 11507 100
## 3 2021 232832 232832 100
## 4 2022 46587 46587 100
## 5 2023 687827 687827 100
## 6 2018 365355 342773 93.8
## 7 2014 36152 0 0
## 8 2015 44579 0 0
## 9 2016 11094 0 0
## 10 2017 270146 0 0
zero_age <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
zero_count = sum(age == 0, na.rm = TRUE),
zero_percentage = (zero_count / total_count) * 100
) %>%
arrange(desc(zero_percentage))
zero_age
## # A tibble: 10 × 4
## year total_count zero_count zero_percentage
## <dbl> <int> <int> <dbl>
## 1 2017 270146 255509 94.6
## 2 2014 36152 0 0
## 3 2015 44579 0 0
## 4 2016 11094 0 0
## 5 2018 365355 0 0
## 6 2019 28534 0 0
## 7 2020 11507 0 0
## 8 2021 232832 0 0
## 9 2022 46587 0 0
## 10 2023 687827 0 0
The years with valid amount of observations are 2014 2015 2016.
The variable almost doesnt have meaningfull values in 2017 and 2018. The
more recent hotspots have stopped recording this variable
altogether.
Visualization of age variable
hotspots_peak_age <- hotspots_peak %>%
filter(age > 0 & !is.na(age))
# Create the histogram for Age
ggplot(hotspots_peak_age, aes(x = age)) +
geom_histogram(binwidth = 100, fill = "steelblue", color = "black") +
labs(title = "Distribution of Tree Age at Fire Hotspots",
x = "Age",
y = "Frequency") +
theme_minimal() +
scale_y_continuous(labels = scales::comma)

“est area” (Estimated Area Burned )
# approximate burned area based on historical average area burned per hotspot by agency and fuel type
# Count missing values in the estarea
sum(is.na(hotspots_peak$estarea))
## [1] 1223385
# Identify years with the most missing values
missing_estarea <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(estarea)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_estarea
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 687827 100
## 2 2022 46587 46587 100
## 3 2021 232832 232832 100
## 4 2020 11507 9827 85.4
## 5 2019 28534 18305 64.2
## 6 2018 365355 200009 54.7
## 7 2017 270146 0 0
## 8 2016 11094 5880 53.0
## 9 2015 44579 22118 49.6
## 10 2014 36152 0 0
2023 2022 and 2021 do not have this variable. 2015 2016 2018
2019 has around 50% missing values 2020 85 % missing values
Summary of zero value in estarea variable
zero_estarea <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
zero_count = sum(estarea == 0, na.rm = TRUE),
zero_percentage = (zero_count / total_count) * 100
) %>%
arrange(desc(year))
zero_estarea
## # A tibble: 10 × 4
## year total_count zero_count zero_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 0 0
## 2 2022 46587 0 0
## 3 2021 232832 0 0
## 4 2020 11507 9 0.0782
## 5 2019 28534 58 0.203
## 6 2018 365355 0 0
## 7 2017 270146 140895 52.2
## 8 2016 11094 0 0
## 9 2015 44579 0 0
## 10 2014 36152 0 0
Additionally to the missing values already described, 2017 has
almost 50% zero values The only year with values present is 2014 The
recent hotspots have stopped recording this variable
altogether.
“polyid” variable
# Count missing values in the estarea
sum(is.na(hotspots_peak$polyid))
## [1] 1384975
# Identify years with the most missing values
missing_polyid <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(polyid)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_polyid
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 687827 100
## 2 2022 46587 46587 100
## 3 2021 232832 232832 100
## 4 2020 11507 11507 100
## 5 2019 28534 28534 100
## 6 2018 365355 365355 100
## 7 2017 270146 0 0
## 8 2016 11094 3193 28.8
## 9 2015 44579 7984 17.9
## 10 2014 36152 1156 3.20
There is no polyid variable in year 2018-2023.
zero_polyid <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
zero_count = sum(polyid == 0, na.rm = TRUE),
zero_percentage = (zero_count / total_count) * 100
) %>%
arrange(desc(year))
zero_polyid
## # A tibble: 10 × 4
## year total_count zero_count zero_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 0 0
## 2 2022 46587 0 0
## 3 2021 232832 0 0
## 4 2020 11507 0 0
## 5 2019 28534 0 0
## 6 2018 365355 0 0
## 7 2017 270146 267596 99.1
## 8 2016 11094 0 0
## 9 2015 44579 0 0
## 10 2014 36152 0 0
Additionally 2017 has almost all zero value.
Initially the variable was used to identify specific event, but
recently the reporting process has changed Only meaningful information
in 2014 2015 and 2016, it may be not suitable for meaningful analyses
for the 10 year period
“pcuring” (Percent Curing)
# This indicates the proportion of grass and other fuels (non-woody) that are in a cured (dried) state, ready to burn.
# For example, if pcuring is 80%, it means that 80% of the vegetation is dead or dry enough to ignite and sustain fire.
# Count missing values in the pcuring
sum(is.na(hotspots_peak$pcuring))
## [1] 687686
# Identify years with the most missing values
missing_pcuring <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(pcuring)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_pcuring
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 687686 100.
## 2 2022 46587 0 0
## 3 2021 232832 0 0
## 4 2020 11507 0 0
## 5 2019 28534 0 0
## 6 2018 365355 0 0
## 7 2017 270146 0 0
## 8 2016 11094 0 0
## 9 2015 44579 0 0
## 10 2014 36152 0 0
2023 does not have this variable.
Analyse pcuring without year of 2023
# Exclude the year 2023 where pcuring is missing
hotspots_peak_pcuring <- hotspots_peak %>%
filter(!(year == 2023)) %>%
filter(!is.na(pcuring))
# Check the summary of the filtered data
summary(hotspots_peak_pcuring$pcuring)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 21.00 36.00 37.37 50.00 125.00
# Checking for values greater than 100%
pcuring_above_100 <- hotspots_peak %>% filter(pcuring > 100)
summary(pcuring_above_100)
## lat lon rep_date
## Min. :50.27 Min. :-134.4 Min. :2018-08-01 03:10:00
## 1st Qu.:55.20 1st Qu.:-132.1 1st Qu.:2018-08-09 02:21:15
## Median :57.27 Median :-131.2 Median :2018-08-17 05:12:30
## Mean :56.06 Mean :-128.5 Mean :2018-11-02 22:40:00
## 3rd Qu.:58.12 3rd Qu.:-127.6 3rd Qu.:2018-11-11 01:31:15
## Max. :59.44 Max. :-117.2 Max. :2019-07-10 05:05:00
##
## uid source sensor satellite
## Min. :3033797 Length:4 Length:4 Length:4
## 1st Qu.:4125436 Class :character Class :character Class :character
## Median :5217074 Mode :character Mode :character Mode :character
## Mean :4527720
## 3rd Qu.:5274682
## Max. :5332290
## NA's :1
## agency temp rh ws
## Length:4 Min. :13.92 Min. :39.00 Min. :2.757
## Class :character 1st Qu.:15.98 1st Qu.:44.25 1st Qu.:2.812
## Mode :character Median :18.76 Median :49.00 Median :2.924
## Mean :18.43 Mean :48.25 Mean :3.022
## 3rd Qu.:21.21 3rd Qu.:53.00 3rd Qu.:3.134
## Max. :22.29 Max. :56.00 Max. :3.482
##
## wd pcp ffmc dmc
## Min. :237.0 Min. :0.00000 Min. :83.92 Min. : 32.17
## 1st Qu.:243.0 1st Qu.:0.03375 1st Qu.:86.59 1st Qu.: 45.81
## Median :248.5 Median :0.20450 Median :88.80 Median : 70.22
## Mean :259.8 Mean :0.30175 Mean :88.02 Mean : 71.45
## 3rd Qu.:265.2 3rd Qu.:0.47250 3rd Qu.:90.23 3rd Qu.: 95.86
## Max. :305.0 Max. :0.79800 Max. :90.54 Max. :113.18
##
## dc isi bui fwi
## Min. :301.9 Min. :3.149 Min. : 50.8 Min. :16.05
## 1st Qu.:346.0 1st Qu.:4.528 1st Qu.: 68.7 1st Qu.:17.26
## Median :466.4 Median :6.093 Median :108.0 Median :20.31
## Mean :513.7 Mean :6.011 Mean :104.6 Mean :20.36
## 3rd Qu.:634.1 3rd Qu.:7.575 3rd Qu.:143.9 3rd Qu.:23.41
## Max. :820.0 Max. :8.711 Max. :151.4 Max. :24.77
##
## fuel ros sfc tfc
## Length:4 Min. : 0.000 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median : 0.218 Median :0.1750 Median :0.1750
## Mean : 2.609 Mean :0.3669 Mean :0.3669
## 3rd Qu.: 2.827 3rd Qu.:0.5419 3rd Qu.:0.5419
## Max. :10.000 Max. :1.1174 Max. :1.1174
##
## bfc hfi cfb age estarea
## Min. :0.0000 Min. : 0 Min. :0 Min. :255 Min. :7.516
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0 1st Qu.:255 1st Qu.:7.719
## Median :0.1578 Median : 73 Median :0 Median :255 Median :7.923
## Mean :0.8912 Mean : 299 Mean :0 Mean :255 Mean :7.923
## 3rd Qu.:1.0490 3rd Qu.: 372 3rd Qu.:0 3rd Qu.:255 3rd Qu.:8.126
## Max. :3.2490 Max. :1050 Max. :0 Max. :255 Max. :8.329
## NA's :3 NA's :2
## polyid pcuring cfactor greenup elev
## Min. : NA Min. :104.0 Min. :1 Min. :1 Min. :1127
## 1st Qu.: NA 1st Qu.:113.8 1st Qu.:1 1st Qu.:1 1st Qu.:1654
## Median : NA Median :118.0 Median :1 Median :1 Median :1898
## Mean :NaN Mean :116.2 Mean :1 Mean :1 Mean :1913
## 3rd Qu.: NA 3rd Qu.:120.5 3rd Qu.:1 3rd Qu.:1 3rd Qu.:2157
## Max. : NA Max. :125.0 Max. :1 Max. :1 Max. :2727
## NA's :4 NA's :1
## cfl tfc0 sfl ecozone sfc0
## Min. :0 Min. :0.0000 Min. :-1 Min. :12 Min. :0.35
## 1st Qu.:0 1st Qu.:0.1750 1st Qu.:-1 1st Qu.:12 1st Qu.:0.35
## Median :0 Median :0.3500 Median :-1 Median :12 Median :0.35
## Mean :0 Mean :0.4891 Mean :-1 Mean :12 Mean :0.35
## 3rd Qu.:0 3rd Qu.:0.7337 3rd Qu.:-1 3rd Qu.:12 3rd Qu.:0.35
## Max. :0 Max. :1.1174 Max. :-1 Max. :12 Max. :0.35
## NA's :1 NA's :3 NA's :3 NA's :3
## cbh year month event_cluster
## Min. :-1 Min. :2018 Aug :3 Min. : 0
## 1st Qu.:-1 1st Qu.:2018 Jul :1 1st Qu.: 0
## Median :-1 Median :2018 Jan :0 Median : 4659
## Mean :-1 Mean :2018 Feb :0 Mean : 4970
## 3rd Qu.:-1 3rd Qu.:2018 Mar :0 3rd Qu.: 9629
## Max. :-1 Max. :2019 Apr :0 Max. :10563
## NA's :3 (Other):0
4 entries with values above 100% - out of range values, possible
mistake in reporting
Visualization of curing percentage
zero_pcuring <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
zero_count = sum(pcuring == 0, na.rm = TRUE),
zero_percentage = (zero_count / total_count) * 100
) %>%
arrange(desc(year))
zero_pcuring
## # A tibble: 10 × 4
## year total_count zero_count zero_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 0 0
## 2 2022 46587 15 0.0322
## 3 2021 232832 5266 2.26
## 4 2020 11507 12 0.104
## 5 2019 28534 342 1.20
## 6 2018 365355 51623 14.1
## 7 2017 270146 3008 1.11
## 8 2016 11094 417 3.76
## 9 2015 44579 783 1.76
## 10 2014 36152 773 2.14
sum(zero_pcuring$zero_count)
## [1] 62239
# 62239 zero values
Distribution of curing percentage
# Create the histogram for pcuring
ggplot(hotspots_peak_pcuring, aes(x = pcuring)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Curing Percentage (pcuring) Values",
x = "Curing Percentage (pcuring)",
y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

# Create the boxplot for pcuring across years
ggplot(hotspots_peak_pcuring, aes(x = factor(year), y = pcuring)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Curing Percentage (pcuring) Across Years",
x = "Year",
y = "Curing Percentage (pcuring)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The median pcuring values for these years range between 30% and
50%.There are many entries with pcuring at 0%, possibly indicating
periods with no drying or data recording issues.
“cfactor”(Curing Factor)
# The amount of curing, or drying, of vegetation.
# Count missing values in the cfactor
sum(is.na(hotspots_peak$cfactor))
## [1] 1007287
# Identify years with the most missing values
missing_cfactor <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(cfactor)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_cfactor
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 687827 100
## 2 2022 46587 46587 100
## 3 2021 232832 232832 100
## 4 2020 11507 11507 100
## 5 2019 28534 28534 100
## 6 2018 365355 0 0
## 7 2017 270146 0 0
## 8 2016 11094 0 0
## 9 2015 44579 0 0
## 10 2014 36152 0 0
2019 - 2023 does not have this variable.
Analyse of Curing factor after years removal
# Exclude the years where cfactor is missing
hotspots_peak_cfactor <- hotspots_peak %>%
filter((year < 2019)) %>%
filter(!is.na(cfactor))
# Check the summary of the filtered data
summary(hotspots_peak_cfactor$cfactor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0100 0.0400 0.1306 0.1200 1.0000
Represents the curing factor as a proportion rather than a
percentage.
Visualization of Curing Factor
# Create the histogram for cfactor
ggplot(hotspots_peak_cfactor, aes(x = cfactor)) +
geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Curing Factor (cfactor) Values",
x = "Curing Factor (cfactor)",
y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal()

# Create the boxplot for cfactor across years
ggplot(hotspots_peak_cfactor, aes(x = factor(year), y = cfactor)) +
geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Curing Factor (cfactor) Across Years",
x = "Year",
y = "Curing Factor (cfactor)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 15),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10))

The variable has stopped being recorded recently. It may be not
suitable for meaningful analyses for the 10 year period
“greenup” (phenological state of deciduous trees)
# phenological state of deciduous trees (0=leafless, 1=green)
sum(is.na(hotspots_peak$greenup))
## [1] 687686
# Identify years with the most missing values
missing_greenup <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(greenup)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_greenup
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 687686 100.
## 2 2022 46587 0 0
## 3 2021 232832 0 0
## 4 2020 11507 0 0
## 5 2019 28534 0 0
## 6 2018 365355 0 0
## 7 2017 270146 0 0
## 8 2016 11094 0 0
## 9 2015 44579 0 0
## 10 2014 36152 0 0
2023 does not have this variable.
# Count the number of observations in each greenup state
greenup_count <- hotspots_peak %>%
group_by(greenup) %>%
summarise(count = n())
greenup_count
## # A tibble: 4 × 2
## greenup count
## <dbl> <int>
## 1 -1 4
## 2 0 412757
## 3 1 634166
## 4 NA 687686
There are invalid values present in the dataset. This variable
is only 0 or 1
Summary of greenup variable
# Filter out invalid values in the greenup variable
hotspots_peak_greenup <- hotspots_peak %>%
filter(greenup %in% c(0, 1))
# Check the summary of the filtered data
summary(hotspots_peak_greenup$greenup)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.6057 1.0000 1.0000
greenup_by_year <- hotspots_peak_greenup %>%
group_by(year, greenup) %>%
summarise(count = n(), .groups = 'drop') %>%
arrange(desc(year), greenup)
Visualization of Greenup variable
# Visualize the count of observations in each greenup state
ggplot(greenup_by_year, aes(x = factor(greenup), y = count, fill = factor(greenup))) +
geom_bar(stat = "identity") +
labs(title = "Count of Observations by Greenup State",
x = "Greenup State",
y = "Count",
fill = "Greenup State") +
scale_fill_manual(values = c("0" = "bisque3", "1" = "lightgreen")) +
theme_minimal()+
scale_y_continuous(labels = scales::comma)

There are significantly more observations for greenup state 1
(trees with green leaves) compared to state 0 (leafless trees).The
greenup state alone may not provide significant insights into fire
behavior or risks.
“elev” (Elevation above sea level)
# BC's elevation ranges from sea level to approximately 4000 meters,
# with the highest peak being Mount Fairweather at 4663 meters.
# The elev can influencing local weather pattern, fuel moisture
# Check for missing values
sum(is.na(hotspots_peak$elev))
## [1] 36148
A low percentage of missing values
range(hotspots_peak$elev, na.rm = TRUE)
## [1] -1 3129
-1 3129 While there is one instance of a below sea level
elevation, the range is valid for BC
# Identify and analyze outliers
elev_outliers <- hotspots_peak %>%
filter(elev < 0)
print(elev_outliers)
## # A tibble: 314 × 42
## lat lon rep_date uid source sensor satellite agency temp
## <dbl> <dbl> <dttm> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 49.0 -120. 2015-07-05 11:00:00 11188377 NASA MODIS Aqua BC 33.4
## 2 49.1 -125. 2015-10-20 21:06:00 11787219 NOAA AVHRR NOAA-19 BC 13.8
## 3 54.0 -129. 2015-07-01 20:44:00 11073822 USFS MODIS Terra BC 21.2
## 4 54.0 -129. 2015-07-01 20:44:00 11073821 USFS MODIS Terra BC 21.2
## 5 49.0 -124. 2015-05-17 21:25:00 10692271 NASA MODIS Aqua BC 15.9
## 6 49 -120. 2015-07-03 14:29:00 11137964 NOAA AVHRR NOAA-15 BC 35.8
## 7 51.0 -127. 2015-08-03 04:15:00 11470837 NOAA AVHRR METOP-A BC 17.6
## 8 49.0 -120. 2015-07-03 09:20:00 11159191 USFS IBAND S-NPP BC 35.8
## 9 49.0 -120. 2015-07-03 09:20:00 11159192 USFS IBAND S-NPP BC 35.8
## 10 53.6 -132. 2015-10-12 19:49:00 11759426 USFS Lands… L8 BC 12.8
## # ℹ 304 more rows
## # ℹ 33 more variables: rh <dbl>, ws <dbl>, wd <dbl>, pcp <dbl>, ffmc <dbl>,
## # dmc <dbl>, dc <dbl>, isi <dbl>, bui <dbl>, fwi <dbl>, fuel <chr>,
## # ros <dbl>, sfc <dbl>, tfc <dbl>, bfc <dbl>, hfi <dbl>, cfb <dbl>,
## # age <dbl>, estarea <dbl>, polyid <dbl>, pcuring <dbl>, cfactor <dbl>,
## # greenup <dbl>, elev <dbl>, cfl <dbl>, tfc0 <dbl>, sfl <dbl>, ecozone <dbl>,
## # sfc0 <dbl>, cbh <dbl>, year <dbl>, month <ord>, event_cluster <int>
# Histogram of elevation values
ggplot(hotspots_peak, aes(x = elev)) +
geom_histogram(binwidth = 100, fill = "skyblue", color = "black") +
labs(title = "Distribution of Elevation Values",
x = "Elevation (meters)",
y = "Frequency") +
theme_minimal()
## Warning: Removed 36148 rows containing non-finite outside the scale range
## (`stat_bin()`).

Overall the distribution of elevation levels is normal for
British Columbia.
“ecozone” Ecozone in which hotspot is located
# Identify years with the most missing values
missing_ecozone <- hotspots_peak %>%
group_by(year) %>%
summarise(
total_count = n(),
missing_count = sum(is.na(ecozone)),
missing_percentage = (missing_count / total_count) * 100
) %>%
arrange(desc(year))
missing_ecozone
## # A tibble: 10 × 4
## year total_count missing_count missing_percentage
## <dbl> <int> <int> <dbl>
## 1 2023 687827 24 0.00349
## 2 2022 46587 0 0
## 3 2021 232832 0 0
## 4 2020 11507 4 0.0348
## 5 2019 28534 0 0
## 6 2018 365355 365355 100
## 7 2017 270146 270146 100
## 8 2016 11094 11094 100
## 9 2015 44579 44579 100
## 10 2014 36152 36152 100
For the years 2019-2023 there are only 24+4 missing
entries.
# Count the number of hotspots in each ecozone
ecozone_counts <- hotspots_peak %>% filter(year >= 2019) %>%
group_by(ecozone) %>%
summarise(count = n())
# Print the ecozone counts
print(ecozone_counts)
## # A tibble: 6 × 2
## ecozone count
## <dbl> <int>
## 1 4 321952
## 2 9 54214
## 3 12 40810
## 4 13 19171
## 5 14 571112
## 6 NA 28
# Convert ecozone to a factor
ecozone_counts$ecozone <- factor(ecozone_counts$ecozone, levels = c(14, 4, 9, 12, 13, "NA"))
# Define custom colors and labels for the selected ecozones (Statistics Canada site)
# https://www.statcan.gc.ca/en/subjects/standard/environment/elc/2017-map
custom_colors <- c("14" = "#B5D79F", "4" = "#989898", "9" = "#36C48E",
"12" = "#ACC32D", "13" = "#05734D", "NA" = "#000000")
custom_labels <- c("14" = "Montane Cordillera", "4" = "Taiga Plains", "9" = "Boreal Plains",
"12" = "Boreal Cordillera", "13" = "Pacific Maritime")
# Visualize the distribution of hotspots by ecozone with custom legend
ggplot(ecozone_counts, aes(x = reorder(ecozone, -count), y = count, fill = ecozone)) +
geom_bar(stat = "identity") +
labs(title = "Distribution of Hotspots by Ecozone (2019 onwards)",
x = "Ecozone",
y = "Number of Hotspots",
fill = "Ecozone") +
theme_minimal() +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = custom_colors, labels = custom_labels) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10))

The plot shows distribution of ecozones in the hotspot
dataset.
The plot shows that the Montane Cordillera ecozone has the
highest number of hotspots, indicating it is the most fire-prone ecozone
in the dataset from 2019 onwards.
For the variables : cfl, sfl,tfc0, sfc0, cbh are not suitable
for analysis of the decade as data present only in recent
years